#ifndef QHOEFFDING
#define QHOEFFDING

#include "Env.h"
#include <vector>
#include <tuple>

class Qlearning_gen {
public:
    /**
     * @brief Builds a learner for the given MDP environment.
     * @param mdp The MDP environment to learn on. The class stores a reference member of the
     *            same type, so the environment presumably has to outlive this object
     *            (confirm against the constructor definition).
     * @param c Exploration constant applied to the UCB bonus.
     * @param total_episodes Number of episodes the learning process is run for.
     */
    Qlearning_gen(FiniteStateFiniteActionMDP& mdp, float c, int total_episodes);

    /**
     * @brief Executes the main learning loop over the configured number of episodes.
     * @return A five-element tuple, in this order:
     *         -# std::vector<float>: optimal value function (V*) for the initial state distribution.
     *         -# std::vector<std::vector<std::vector<float>>>: optimal Q-function (Q*).
     *         -# std::vector<float>: value function of the policy learned in the last episode.
     *         -# std::vector<std::vector<std::vector<float>>>: final learned global Q-function.
     *         -# std::vector<float>: raw regret (gap), one entry per episode.
     */
    std::tuple<std::vector<float>,
               std::vector<std::vector<std::vector<float>>>,
               std::vector<float>,
               std::vector<std::vector<std::vector<float>>>,
               std::vector<float>>
    learn();

private:
    // Shorthand for the nested-vector tables used by the private state and helpers.
    // These are plain aliases: the underlying types are exactly the spelled-out
    // std::vector nestings, so out-of-line definitions may use either spelling.
    using FloatTable2D = std::vector<std::vector<float>>;
    using FloatTable3D = std::vector<std::vector<std::vector<float>>>;
    using CountTable3D = std::vector<std::vector<std::vector<int>>>;

    // --- Configuration -------------------------------------------------------
    FiniteStateFiniteActionMDP& mdp;  // Environment (non-owning reference).
    float c;                          // Exploration constant for the UCB bonus.
    int total_episodes;               // Number of episodes to run.

    // --- Learning state ------------------------------------------------------
    // NOTE(review): the index layout and exact role of each table are defined by the
    // implementation file; the descriptions below are inferred from names only.
    FloatTable2D V_func;    // Presumably the current value-function estimate.
    FloatTable3D V_next;    // Presumably next-state value terms used in updates.
    FloatTable3D global_Q;  // Global Q-function (read by choose_action, written by update_Q).
    CountTable3D N;         // Presumably visit counters -- confirm how N differs from n.
    CountTable3D n;         // Presumably visit counters -- confirm how n differs from N.

    // --- Diagnostics ---------------------------------------------------------
    std::vector<float> regret;   // Presumably accumulated regret -- confirm.
    std::vector<float> raw_gap;  // Presumably the per-episode raw gap -- confirm.

    // --- Helpers -------------------------------------------------------------

    /**
     * @brief Plays one episode against the environment.
     * @return A pair whose first element holds the rewards collected during the episode
     *         and whose second element is the initial state.
     */
    std::pair<FloatTable3D, int> run_episode();

    /**
     * @brief Derives the current policy (actions) from the global Q-values.
     * @return A 3D vector encoding the deterministic policy.
     */
    FloatTable3D choose_action();

    /**
     * @brief Applies an episode's rewards to the global Q-function.
     * @param rewards Rewards collected during the episode.
     */
    void update_Q(const FloatTable3D& rewards);
};

#endif // QHOEFFDING